##########################################
### Working with data in R
### Samantha Zuhlke
###
### This R script reviews how to: 
### - import/merge data in R
### - basic data visualization
### - calculate sample statistics
##########################################

##########################################
# begin preamble

# install necessary packages
# install.packages("foreign")
# install.packages("readstata13")
# install.packages("ggplot2")
# install.packages("lattice")
# install.packages("car")
# install.packages("dplyr")
# install.packages("MASS")

# load necessary packages
# library() stops with an error if a package is not installed; require() only
# returns FALSE, so the script would carry on and fail later with a confusing
# "could not find function" message.
# The order is kept as before: dplyr is attached after MASS.
library(graphics) # base graphics, e.g. hist()
library(MASS)
library(lattice)
library(base)
library(dplyr)
library(ggplot2) # plotting functions and the diamonds data used below

# set working directory 
# getwd() shows the current working directory (printed when run at the console)
getwd()
# setwd() changes the working directory. The data files read below with
# read.csv() and the PDF written later use relative file names, so they are
# looked up in / written to this folder. Replace the path with your own.
setwd("/Users/samanthazuhlke/Desktop/R Workspace/POLS_309") 
# Two reminders: 
##### everyone's working directory will be different.
##### As a reminder, every R script needs a preamble.

# Note, you may want to use the "here" package with students to set workspaces, 
## which makes setting workspaces much easier. 

# end preamble
##########################################
# remove every object that ls() lists in the current workspace
rm(list = ls())

# Note, it's not best practice to begin your R script with rm(list=ls()), 
#   because it only clears objects in your Global Environment and not other settings.
# It's better to start a new R session when you run a new script.
# However, I find that impractical when teaching multiple tutorials in a lab, 
#   and so use rm(list=ls()) in tutorials when moving between scripts.
# If you do the same, be sure to flag this for your students that it isn't best practice!

##########################################
# Working with data 

# import data
# read the two sample files from the working directory and look at each one
data_a <- read.csv(file = "diamonds_sample_a.csv")
View(data_a)

data_b <- read.csv(file = "diamonds_sample_b.csv")
View(data_b)

# merging data
# Both data sets need a column in common to match rows on.
# Which column do data_a and data_b share?

# merge(x = data set 1, y = data set 2, by = "name of the common variable")
data <- merge(x = data_a, y = data_b, by = "obs")
View(data)

# native data to R packages 
# "diamonds" is a native dataset in package "ggplot2"
# about the dataset: https://ggplot2.tidyverse.org/reference/diamonds.html
# load the diamonds data into the workspace as an object called "diamonds"
data(list = "diamonds")
View(diamonds)

# the rest of this tutorial only uses the diamonds data,
# so empty the workspace and load diamonds again
rm(list = ls())
data(list = "diamonds")

##########################################
# summary statistics 

# first thing, look at your data. 
# open the data in the spreadsheet-style viewer
View(diamonds)

# recall: what are the 4 types of data?
# nominal, ordinal, interval, ratio
# open the viewer again and keep it in front of you while answering the
# questions below
View(diamonds)
# what measurement type is: 
## carat?
## cut? 
## color? 
## price?

# Histograms
# ggplot2 histograms: ggplot() + geom_histogram()
# (qplot() is deprecated as of ggplot2 3.4.0 and now raises a warning)
# aes() says which variable goes on the x axis; geom_histogram() draws the bars
ggplot(diamonds, aes(x = carat)) + geom_histogram()
ggplot(diamonds, aes(x = price)) + geom_histogram()
ggplot(diamonds, aes(x = depth)) + geom_histogram()

# the same three histograms using base R graphics
hist(diamonds$carat)
hist(diamonds$price)
hist(diamonds$depth)

# Bar Plots
# These plots use ggplot() + a geom_*() layer instead of qplot(), which is
# deprecated as of ggplot2 3.4.0.
ggplot(diamonds, aes(x = cut)) + geom_bar()

# Box plots  (the geom_*() layer == "the style of plot I want")
ggplot(diamonds, aes(x = factor(cut), y = price)) + geom_boxplot()

# Jitter Plots
ggplot(diamonds, aes(x = factor(cut), y = price)) + geom_jitter()

# Violin plots
ggplot(diamonds, aes(x = factor(cut), y = price)) + geom_violin()

# Exporting/Saving Graphics
# There are multiple ways to export graphs in R so they may be used in other programs.
# R can create .pdf (usually the best option), .wmf, .png, .jpeg, .bmp, and .ps files.
# code
pdf(file = "ScatterPlot.pdf") # Tells R to make a PDF of what comes next
# print() draws the plot explicitly. A ggplot is only drawn automatically when
# it is typed at the console; without print(), running this script through
# source() would leave the PDF without a plot.
# ggplot() + geom_point() replaces qplot(), which is deprecated in ggplot2 3.4.0.
print(ggplot(diamonds, aes(x = carat, y = price)) + geom_point())
dev.off() # Tells R to finish making the PDF.
# or Export > Save as Image.

# How to calculate summary statistics 

# summarise every column of the data set at once
summary(diamonds)

# then look at our variables of interest one at a time: price, carat, and cut
summary(diamonds[["price"]])
summary(diamonds[["carat"]])
summary(diamonds[["cut"]])
# different types of data = different summary statistics


# We can also individually ask for each of these summary statistics
# The mean
with(diamonds, mean(carat))
with(diamonds, mean(price))
with(diamonds, mean(cut)) # why do we get a warning and NA instead of a number?

# median
with(diamonds, median(price))

# minimum
with(diamonds, min(price))

# maximum
with(diamonds, max(price))

# variance
with(diamonds, var(price))

# standard deviation
with(diamonds, sd(price))

# test relationship between variance and standard deviation
# Store the results under their own names: objects called "var" or "sd" would
# mask the base functions var() and sd() in the workspace.
price_var <- var(diamonds$price)
# the standard deviation is the square root of the variance
price_sd <- sqrt(price_var)
price_sd
# confirm it matches R's own sd() (TRUE if they agree up to rounding error)
all.equal(price_sd, sd(diamonds$price))

############################
# HELP FUNCTION
# You can usually ask Google, use the Help window, or type directly into the console
# by putting a ? in front of the command or by typing help.
# Ex. 
# ? takes the bare function name: ?getwd opens the help page for getwd().
# (?get(wd) is read as a call to get() and does not look up getwd.)
?getwd
# help() is the long form of the same request
help(plot)